{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "OOP for ML.ipynb",
      "provenance": [],
      "authorship_tag": "ABX9TyPL5TktyjQDEFOnhOdmtCUn"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "MqOtKyjyVoCN"
      },
      "source": [
        "import pandas as pd"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Uu2MQuClV5Hl"
      },
      "source": [
        "traincsv = '/content/sample_data/california_housing_train.csv'\n",
        "\n",
        "testcsv = '/content/sample_data/california_housing_test.csv'"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DtsRo3cy2d4O"
      },
      "source": [
        "### Update 0 : Getting comfortable/Revision of OOP."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vx-IFSqtXRpa"
      },
      "source": [
        "class Model:\n",
        "\n",
        "  def __init__(self , datafile):\n",
        "    self.df = pd.read_csv(datafile)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6Qxt0VYZXfPf"
      },
      "source": [
        "m_trn = Model(traincsv)\n",
        "df_trn = m_trn.df"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tlW2MTVaXjDs"
      },
      "source": [
        "m_tst = Model(testcsv)\n",
        "df_tst = m_tst.df"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 130
        },
        "id": "eY3WvvUuXnRn",
        "outputId": "4aa8d521-45fa-4126-9f09-4a3b1fe39fe7"
      },
      "source": [
        "df_trn.head(2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>longitude</th>\n",
              "      <th>latitude</th>\n",
              "      <th>housing_median_age</th>\n",
              "      <th>total_rooms</th>\n",
              "      <th>total_bedrooms</th>\n",
              "      <th>population</th>\n",
              "      <th>households</th>\n",
              "      <th>median_income</th>\n",
              "      <th>median_house_value</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-114.31</td>\n",
              "      <td>34.19</td>\n",
              "      <td>15.0</td>\n",
              "      <td>5612.0</td>\n",
              "      <td>1283.0</td>\n",
              "      <td>1015.0</td>\n",
              "      <td>472.0</td>\n",
              "      <td>1.4936</td>\n",
              "      <td>66900.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>-114.47</td>\n",
              "      <td>34.40</td>\n",
              "      <td>19.0</td>\n",
              "      <td>7650.0</td>\n",
              "      <td>1901.0</td>\n",
              "      <td>1129.0</td>\n",
              "      <td>463.0</td>\n",
              "      <td>1.8200</td>\n",
              "      <td>80100.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   longitude  latitude  ...  median_income  median_house_value\n",
              "0    -114.31     34.19  ...         1.4936             66900.0\n",
              "1    -114.47     34.40  ...         1.8200             80100.0\n",
              "\n",
              "[2 rows x 9 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 130
        },
        "id": "TCiAaxd6XrFx",
        "outputId": "dd64c3f7-9070-407d-ec36-f80a1de780d8"
      },
      "source": [
        "df_tst.head(2)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>longitude</th>\n",
              "      <th>latitude</th>\n",
              "      <th>housing_median_age</th>\n",
              "      <th>total_rooms</th>\n",
              "      <th>total_bedrooms</th>\n",
              "      <th>population</th>\n",
              "      <th>households</th>\n",
              "      <th>median_income</th>\n",
              "      <th>median_house_value</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-122.05</td>\n",
              "      <td>37.37</td>\n",
              "      <td>27.0</td>\n",
              "      <td>3885.0</td>\n",
              "      <td>661.0</td>\n",
              "      <td>1537.0</td>\n",
              "      <td>606.0</td>\n",
              "      <td>6.6085</td>\n",
              "      <td>344700.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>-118.30</td>\n",
              "      <td>34.26</td>\n",
              "      <td>43.0</td>\n",
              "      <td>1510.0</td>\n",
              "      <td>310.0</td>\n",
              "      <td>809.0</td>\n",
              "      <td>277.0</td>\n",
              "      <td>3.5990</td>\n",
              "      <td>176500.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   longitude  latitude  ...  median_income  median_house_value\n",
              "0    -122.05     37.37  ...         6.6085            344700.0\n",
              "1    -118.30     34.26  ...         3.5990            176500.0\n",
              "\n",
              "[2 rows x 9 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XRlXwQ6jaCbU"
      },
      "source": [
        "## Update 1 : LinearRegression utility."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "s3Rvpl3oZIlL"
      },
      "source": [
        "class Model:\n",
        "\n",
        "  def __init__(self , datafile = traincsv):\n",
        "    self.df = pd.read_csv(datafile)\n",
        "    self.linear_reg = LinearRegression()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oosce7oraJk-"
      },
      "source": [
        "from sklearn.linear_model import LinearRegression\n",
        "m = Model()\n",
        "tr = m.df\n",
        "lr = m.linear_reg"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "6blm5IjPupbx"
      },
      "source": [
        "## Update 2 : Train-Test-Split utility"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oWz-8QBUvE-j"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "import numpy as np"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JY51ux5vaR6l"
      },
      "source": [
        "class Model:\n",
        "\n",
        "  def __init__(self , datafile = traincsv):\n",
        "    self.df = pd.read_csv(datafile)\n",
        "    self.linear_reg = LinearRegression()\n",
        "\n",
        "  def split(self, output_column ,test_fraction):\n",
        "\n",
        "    X = np.array(self.df.drop(output_column, axis=1))\n",
        "    y = np.array(self.df[output_column])\n",
        "\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=42)\n",
        "\n",
        "    return X_train, X_test, y_train, y_test\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MdPv3Kv9atli"
      },
      "source": [
        "m = Model()\n",
        "\n",
        "df = m.df\n",
        "\n",
        "out= 'median_house_value'\n",
        "\n",
        "X_train, X_test, y_train, y_test = m.split(out, 0.2)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "x2zGUa8CvpTq",
        "outputId": "33a7adc5-7f5b-4ea2-c288-9e7b4f87d2af"
      },
      "source": [
        "X_train.shape , y_train.shape"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "((13600, 8), (13600,))"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LvzfbQv7zTfu"
      },
      "source": [
        "## Update 3 : Fitting and Predicting Utilities. \n",
        "\n",
        "> Fitting on Training, Predicting on Testing."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZAKM_EyfzCKB"
      },
      "source": [
        "class Model:\n",
        "\n",
        "  def __init__(self , datafile = traincsv):\n",
        "    self.df = pd.read_csv(datafile)\n",
        "    self.linear_reg = LinearRegression()\n",
        "\n",
        "  def split(self, output_column ,test_fraction):\n",
        "\n",
        "    X = np.array(self.df.drop(output_column, axis=1))\n",
        "    y = np.array(self.df[output_column])\n",
        "\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=42)\n",
        "\n",
        "    # return X_train, X_test, y_train, y_test\n",
        "\n",
        "  def fit(self):\n",
        "    self.model = self.linear_reg.fit(X_train , y_train)\n",
        "\n",
        "  def predict(self):\n",
        "    yp = self.linear_reg.predict(X_test)\n",
        "    return yp    "
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "389VJKNP0DG4"
      },
      "source": [
        "m = Model()\n",
        "m.split(out, 0.2)\n",
        "m.fit()\n",
        "yp = m.predict()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7N2H1dtH1m3l"
      },
      "source": [
        "## Update 4 : Model Performance Utility"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EFnjt6580O31"
      },
      "source": [
        "from sklearn.metrics import mean_squared_error as mse"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BrTm90-m0R67"
      },
      "source": [
        "class Model:\n",
        "\n",
        "  def __init__(self , datafile = traincsv):\n",
        "    self.df = pd.read_csv(datafile)\n",
        "    self.linear_reg = LinearRegression()\n",
        "\n",
        "  def split(self, output_column ,test_fraction):\n",
        "\n",
        "    X = np.array(self.df.drop(output_column, axis=1))\n",
        "    y = np.array(self.df[output_column])\n",
        "\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=42)\n",
        "\n",
        "    # return X_train, X_test, y_train, y_test\n",
        "\n",
        "  def fit(self):\n",
        "    self.model = self.linear_reg.fit(X_train , y_train)\n",
        "\n",
        "  def predict(self):\n",
        "    yp = self.linear_reg.predict(X_test)\n",
        "    return yp    \n",
        "\n",
        "  def score(self):\n",
        "    rmse = np.sqrt(mse(y_test , yp))\n",
        "    return rmse"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cskJ6n_y0ZPn",
        "outputId": "1e360512-e449-40d7-8dac-098a9a72290c"
      },
      "source": [
        "m = Model()\n",
        "m.split(out, 0.2)\n",
        "m.fit()\n",
        "yp = m.predict()\n",
        "\n",
        "print(f'The models error is : {m.score()}')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "The models error is : 68078.32552452666\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3TD7PM4n2KDV"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}